{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import date,time,timedelta\n",
    "\n",
    "path_cases = 'https://moh-malaysia-covid19.s3.ap-southeast-1.amazonaws.com/linelist_cases.parquet'\n",
    "path_param_geo = 'https://raw.githubusercontent.com/MoH-Malaysia/covid19-public/main/epidemic/linelist/param_geo.csv'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>state</th>\n",
       "      <th>district</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-01-25</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Johor Bahru</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-01-25</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Johor Bahru</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-01-25</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Johor Bahru</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-01-25</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Johor Bahru</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-01-29</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Johor Bahru</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date  state     district\n",
       "0  2020-01-25  Johor  Johor Bahru\n",
       "1  2020-01-25  Johor  Johor Bahru\n",
       "2  2020-01-25  Johor  Johor Bahru\n",
       "3  2020-01-25  Johor  Johor Bahru\n",
       "4  2020-01-29  Johor  Johor Bahru"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "geo = pd.read_csv(path_param_geo)\n",
    "idxd_district = dict(zip(geo.idxd, geo.district))\n",
    "idxs_state = dict(zip(geo.idxs, geo.state))\n",
    "district_state = dict(zip(geo.district, geo.state))\n",
    "\n",
    "df = pd.read_parquet(path_cases, columns=['date', 'state', 'district'])\n",
    "df['state'] = df.state.map(idxs_state)\n",
    "df['district'] = df.district.map(idxd_district)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>state</th>\n",
       "      <th>district</th>\n",
       "      <th>cases</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-01-25</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Batu Pahat</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-01-25</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Johor Bahru</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-01-25</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Kluang</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-01-25</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Kota Tinggi</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-01-25</td>\n",
       "      <td>Johor</td>\n",
       "      <td>Kulai</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date  state     district  cases\n",
       "0  2020-01-25  Johor   Batu Pahat      0\n",
       "1  2020-01-25  Johor  Johor Bahru      4\n",
       "2  2020-01-25  Johor       Kluang      0\n",
       "3  2020-01-25  Johor  Kota Tinggi      0\n",
       "4  2020-01-25  Johor        Kulai      0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df[~df.district.isnull()].copy()\n",
    "df.date = pd.to_datetime(df.date)\n",
    "df['cases'] = 1\n",
    "\n",
    "df = df.groupby(['date','district'])\\\n",
    "    .sum()\\\n",
    "    .unstack(fill_value=0)\\\n",
    "    .asfreq('D', fill_value=0)\\\n",
    "    .stack()\\\n",
    "    .sort_index(level=1)\\\n",
    "    .reset_index()\n",
    "df.date = df.date.dt.date\n",
    "df['state'] = df.district.map(district_state)\n",
    "df = df[['date','state','district','cases']]\n",
    "df = df.sort_values(by=['date','state','district']).reset_index(drop=True)\n",
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.6 ('pipenv')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "8a4f38b56e3c9a1ab49544e14927fe9a7f69658154cfdca06974f05e1883f381"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
